2016-07-16

Outline

  • dplyr package: motivation, functions, chaining
  • purrr: working with lists, vectors of data frames

dplyr verbs

There are five primary dplyr verbs, representing distinct data analysis tasks:

  • Filter: Remove the rows of a data frame, producing subsets
  • Arrange: Reorder the rows of a data frame
  • Select: Select particular columns of a data frame
  • Mutate: Add new columns that are functions of existing columns
  • Summarise: Create collapsed summaries of a data frame
  • (Group By: Introduce structure to a data frame)

Filter

data(french_fries, package = "reshape2")
french_fries %>%
    filter(subject == 3, time == 1)
#>   time treatment subject rep potato buttery grassy rancid painty
#> 1    1         1       3   1    2.9     0.0    0.0    0.0    5.5
#> 2    1         1       3   2   14.0     0.0    0.0    1.1    0.0
#> 3    1         2       3   1   13.9     0.0    0.0    3.9    0.0
#> 4    1         2       3   2   13.4     0.1    0.0    1.5    0.0
#> 5    1         3       3   1   14.1     0.0    0.0    1.1    0.0
#> 6    1         3       3   2    9.5     0.0    0.6    2.8    0.0

Arrange

french_fries %>%
    arrange(desc(rancid)) %>%
    head
#>   time treatment subject rep potato buttery grassy rancid painty
#> 1    9         2      51   1    7.3     2.3      0   14.9    0.1
#> 2   10         1      86   2    0.7     0.0      0   14.3   13.1
#> 3    5         2      63   1    4.4     0.0      0   13.8    0.6
#> 4    9         2      63   1    1.8     0.0      0   13.7   12.3
#> 5    5         2      19   2    5.5     4.7      0   13.4    4.6
#> 6    4         3      63   1    5.6     0.0      0   13.3    4.4

Select

french_fries %>%
    select(time, treatment, subject, rep, potato) %>%
    head
#>    time treatment subject rep potato
#> 61    1         1       3   1    2.9
#> 25    1         1       3   2   14.0
#> 62    1         1      10   1   11.0
#> 26    1         1      10   2    9.9
#> 63    1         1      15   1    1.2
#> 27    1         1      15   2    8.8

Summarise

french_fries %>%
    summarise(mean_rancid = mean(rancid, na.rm=TRUE), 
              sd_rancid = sd(rancid, na.rm = TRUE))
#>   mean_rancid sd_rancid
#> 1     3.85223  3.781815

Summarise and Group_by

french_fries %>%
    group_by(time, treatment) %>%
    summarise(mean_rancid = mean(rancid), sd_rancid = sd(rancid))
#> Source: local data frame [30 x 4]
#> Groups: time [?]
#> 
#>      time treatment mean_rancid sd_rancid
#>    <fctr>    <fctr>       <dbl>     <dbl>
#> 1       1         1    2.758333  3.212870
#> 2       1         2    1.716667  2.714801
#> 3       1         3    2.600000  3.202037
#> 4       2         1    3.900000  4.374730
#> 5       2         2    2.141667  3.117540
#> 6       2         3    2.495833  3.378767
#> 7       3         1    4.650000  3.933358
#> 8       3         2    2.895833  3.773532
#> 9       3         3    3.600000  3.592867
#> 10      4         1    2.079167  2.394737
#> # ... with 20 more rows

Let's use these tools

to answer these french fry experiment questions:

  • Is the design complete?
  • Are replicates like each other?
  • How do the ratings on the different scales differ?
  • Are raters giving different scores on average?
  • Do ratings change over the weeks?

Completeness

If the data is complete it should be 12 x 10 x 3 x 2, that is, 6 records for each person. (Assuming that each person rated on all scales.)

To check this we want to tabulate the number of records for each subject, time and treatment. This means select appropriate columns, tabulate, count and spread it out to give a nice table.

french_fries %>% 
  select(subject, time, treatment) %>% 
  tbl_df() %>% 
  count(subject, time) %>%
  spread(time, n)
#> Source: local data frame [12 x 11]
#> Groups: subject [12]
#> 
#>    subject     1     2     3     4     5     6     7     8     9    10
#> *   <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1        3     6     6     6     6     6     6     6     6     6    NA
#> 2       10     6     6     6     6     6     6     6     6     6     6
#> 3       15     6     6     6     6     6     6     6     6     6     6
#> 4       16     6     6     6     6     6     6     6     6     6     6
#> 5       19     6     6     6     6     6     6     6     6     6     6
#> 6       31     6     6     6     6     6     6     6     6    NA     6
#> 7       51     6     6     6     6     6     6     6     6     6     6
#> 8       52     6     6     6     6     6     6     6     6     6     6
#> 9       63     6     6     6     6     6     6     6     6     6     6
#> 10      78     6     6     6     6     6     6     6     6     6     6
#> 11      79     6     6     6     6     6     6     6     6     6    NA
#> 12      86     6     6     6     6     6     6     6     6    NA     6

Check completeness with different scales, too

french_fries %>% 
  gather(type, rating, -subject, -time, -treatment, -rep) %>%
#  select(subject, time, treatment, type) %>% 
#  tbl_df() %>% 
  count(subject, time) %>%
  spread(time, n)
#> Source: local data frame [12 x 11]
#> Groups: subject [12]
#> 
#>    subject     1     2     3     4     5     6     7     8     9    10
#> *   <fctr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1        3    30    30    30    30    30    30    30    30    30    NA
#> 2       10    30    30    30    30    30    30    30    30    30    30
#> 3       15    30    30    30    30    30    30    30    30    30    30
#> 4       16    30    30    30    30    30    30    30    30    30    30
#> 5       19    30    30    30    30    30    30    30    30    30    30
#> 6       31    30    30    30    30    30    30    30    30    NA    30
#> 7       51    30    30    30    30    30    30    30    30    30    30
#> 8       52    30    30    30    30    30    30    30    30    30    30
#> 9       63    30    30    30    30    30    30    30    30    30    30
#> 10      78    30    30    30    30    30    30    30    30    30    30
#> 11      79    30    30    30    30    30    30    30    30    30    NA
#> 12      86    30    30    30    30    30    30    30    30    NA    30

Change in ratings over weeks, relative to experimental design

ff.m <- french_fries %>% 
  gather(type, rating, -subject, -time, -treatment, -rep)
ggplot(data=ff.m, aes(x=time, y=rating, colour=treatment)) +
  geom_point() +
  facet_grid(subject~type) 

Add means over reps, and connect the dots

ff.m.av <- ff.m %>% 
  group_by(subject, time, type, treatment) %>%
  summarise(rating=mean(rating))
ggplot(data=ff.m, aes(x=time, y=rating, colour=treatment)) + 
  facet_grid(subject~type) +
  geom_line(data=ff.m.av, aes(group=treatment))

Your turn

Write an answer to each of the questions:

  • Is the design complete?
  • Are replicates like each other?
  • How do the ratings on the different scales differ?
  • Are raters giving different scores on average?
  • Do ratings change over the weeks?
ffp <- french_fries %>% select(time, treatment, subject, rep, potato) %>% spread(rep, potato)

ggplot(ffp, aes(x = `1`, y=`2`)) + geom_point()

ff_all <- ff.m %>% spread(rep, rating)
ggplot(ff_all, aes(x = `1`, y=`2`)) + geom_point(aes(colour=type))

Working with lots of models

Why would we even do that???

  • Hans Rosling can explain that really well in his TED talk

  • gapminder project

  • gapminder also makes data available (in R through the package gapminder)

First Look

library(gapminder)
ggplot(data=gapminder, aes(x=year, y=lifeExp, group=country)) + 
  geom_line()

How would you describe this plot?

First Look

library(gapminder)
ggplot(data=gapminder, aes(x=year, y=lifeExp, group=country, colour=continent)) + 
  geom_line()

How about now?

Using models as exploratory tools

  • Idea: fit a line to each one of the countries' life expectancies
  • then use e.g. intercept and slope to characterise groups of countries

Prepping

  • purrr package has been unveiled in February (presented by Hadley Wickham at WOMBAT)
  • you might have to re-install the packages tidyr and purrr, i.e. run the following code:
install.packages("tidyr")
install.packages("purrr")

library(tidyr)
library(purrr)

Life Expectancy in the U.S.

usa <- gapminder %>% filter(country=="United States")
head(usa)
#> # A tibble: 6 x 6
#>         country continent  year lifeExp       pop gdpPercap
#>          <fctr>    <fctr> <int>   <dbl>     <int>     <dbl>
#> 1 United States  Americas  1952   68.44 157553000  13990.48
#> 2 United States  Americas  1957   69.49 171984000  14847.13
#> 3 United States  Americas  1962   70.21 186538000  16173.15
#> 4 United States  Americas  1967   70.76 198712000  19530.37
#> 5 United States  Americas  1972   71.34 209896000  21806.04
#> 6 United States  Americas  1977   73.38 220239000  24072.63

Life Expectancy in the U.S. since 1950

ggplot(data=usa, aes(x=year, y=lifeExp)) + 
  geom_point() +
  geom_smooth(method="lm")

Life Expectancy in the U.S. since 1950

usa.lm <- lm(lifeExp~year, data=usa)
usa.lm
#> 
#> Call:
#> lm(formula = lifeExp ~ year, data = usa)
#> 
#> Coefficients:
#> (Intercept)         year  
#>   -291.0845       0.1842

Intercept is estimated life expectancy at 0 BC - let's use 1950 for the first value:

gapminder <- gapminder %>% mutate(year = year-1950)

Nesting data

We don't want to subset the data for every country …

nest() makes a data frame part of another data frame:

by_country <- gapminder %>% group_by(continent, country) %>% nest()
head(by_country)
#> # A tibble: 6 x 3
#>   continent     country              data
#>      <fctr>      <fctr>            <list>
#> 1      Asia Afghanistan <tibble [12 x 4]>
#> 2    Europe     Albania <tibble [12 x 4]>
#> 3    Africa     Algeria <tibble [12 x 4]>
#> 4    Africa      Angola <tibble [12 x 4]>
#> 5  Americas   Argentina <tibble [12 x 4]>
#> 6   Oceania   Australia <tibble [12 x 4]>

Each element of the data variable in by_country is a dataset:

head(by_country$data[[1]])
#> # A tibble: 6 x 4
#>    year lifeExp      pop gdpPercap
#>   <dbl>   <dbl>    <int>     <dbl>
#> 1     2  28.801  8425333  779.4453
#> 2     7  30.332  9240934  820.8530
#> 3    12  31.997 10267083  853.1007
#> 4    17  34.020 11537966  836.1971
#> 5    22  36.088 13079460  739.9811
#> 6    27  38.438 14880372  786.1134
lm(lifeExp~year, data=by_country$data[[1]])
#> 
#> Call:
#> lm(formula = lifeExp ~ year, data = by_country$data[[1]])
#> 
#> Coefficients:
#> (Intercept)         year  
#>     29.3566       0.2753

Fitting multiple models

Now we are using the map function in the package purrr.

map allows us to apply a function to each element of a list.

by_country$model <- by_country$data %>% purrr::map(~lm(lifeExp~year, data=.))
head(by_country)
#> # A tibble: 6 x 4
#>   continent     country              data    model
#>      <fctr>      <fctr>            <list>   <list>
#> 1      Asia Afghanistan <tibble [12 x 4]> <S3: lm>
#> 2    Europe     Albania <tibble [12 x 4]> <S3: lm>
#> 3    Africa     Algeria <tibble [12 x 4]> <S3: lm>
#> 4    Africa      Angola <tibble [12 x 4]> <S3: lm>
#> 5  Americas   Argentina <tibble [12 x 4]> <S3: lm>
#> 6   Oceania   Australia <tibble [12 x 4]> <S3: lm>

On to the broom package

broom allows to extract values from models on three levels:

  • for each model: broom::glance
  • for each coefficient in the model: broom::tidy
  • for each value in the dataset: broom::augment
library(broom)
broom::glance(by_country$model[[1]])
#>   r.squared adj.r.squared    sigma statistic      p.value df    logLik
#> 1 0.9477123     0.9424835 1.222788  181.2494 9.835213e-08  2 -18.34693
#>        AIC      BIC deviance df.residual
#> 1 42.69387 44.14859  14.9521          10
broom::tidy(by_country$model[[1]])
#>          term   estimate  std.error statistic      p.value
#> 1 (Intercept) 29.3566375 0.69898128  41.99918 1.404235e-12
#> 2        year  0.2753287 0.02045093  13.46289 9.835213e-08

broom::augment(by_country$model[[1]])
#>    lifeExp year  .fitted   .se.fit      .resid       .hat   .sigma
#> 1   28.801    2 29.90729 0.6639995 -1.10629487 0.29487179 1.211813
#> 2   30.332    7 31.28394 0.5799442 -0.95193823 0.22494172 1.237512
#> 3   31.997   12 32.66058 0.5026799 -0.66358159 0.16899767 1.265886
#> 4   34.020   17 34.03722 0.4358337 -0.01722494 0.12703963 1.288917
#> 5   36.088   22 35.41387 0.3848726  0.67413170 0.09906760 1.267003
#> 6   38.438   27 36.79051 0.3566719  1.64748834 0.08508159 1.154002
#> 7   39.854   32 38.16716 0.3566719  1.68684499 0.08508159 1.147076
#> 8   40.822   37 39.54380 0.3848726  1.27820163 0.09906760 1.208243
#> 9   41.674   42 40.92044 0.4358337  0.75355828 0.12703963 1.260583
#> 10  41.763   47 42.29709 0.5026799 -0.53408508 0.16899767 1.274051
#> 11  42.129   52 43.67373 0.5799442 -1.54472844 0.22494172 1.148593
#> 12  43.828   57 45.05037 0.6639995 -1.22237179 0.29487179 1.194109
#>         .cooksd  .std.resid
#> 1  2.427205e-01 -1.07742164
#> 2  1.134714e-01 -0.88428127
#> 3  3.603567e-02 -0.59530844
#> 4  1.653992e-05 -0.01507681
#> 5  1.854831e-02  0.58082792
#> 6  9.225358e-02  1.40857509
#> 7  9.671389e-02  1.44222437
#> 8  6.668277e-02  1.10129103
#> 9  3.165567e-02  0.65958143
#> 10 2.334344e-02 -0.47913530
#> 11 2.987950e-01 -1.43494020
#> 12 2.963271e-01 -1.19046907

Extract values for each coefficient

Extract all countries automatically (hello map again!)

by_country$stats <- by_country$model %>% purrr::map(broom::tidy)
by_country_coefs <- by_country %>% unnest(stats)
coefs <- by_country_coefs %>% select(country, continent, term, estimate) %>% spread(term, estimate)

and finally, a visualization:

ggplot(data=coefs, aes(x=`(Intercept)`, y=year, colour=continent)) +
  geom_point()

Extract other model diagnostics

by_country <- by_country %>% unnest(model %>% purrr::map(broom::glance))
by_country$country <- reorder(by_country$country, by_country$r.squared)
ggplot(data=by_country, aes(x=country, y=r.squared, colour=continent)) +
  geom_point() +
  theme(axis.text.x=element_text(angle=-90, hjust=0)) +
  scale_colour_brewer(palette="Dark2")

Compare groups of countries

country_all <- by_country %>% unnest(data)
ggplot(data=subset(country_all, r.squared <= 0.45), 
       aes(x=year+1950, y=lifeExp)) +
  geom_line() +
  facet_wrap(~country)

What do the patterns mean?

ggplot(data=subset(country_all, between(r.squared, 0.45, 0.75)), 
       aes(x=year+1950, y=lifeExp)) +
  geom_line() +
  facet_wrap(~country)

Your turn

  • extract residuals for each of the models and store it in a dataset together with country and continent information

  • plot residuals across the years and fit a smooth. What does the pattern mean?

broom for bio data

biobroom package (Bioconductor; Bass, Nelson, Robinson, Storey, 2016) has the same functions as broom, i.e. glance, tidy, and augment.

These functions provide information depending on the input type

library(biobroom)
data(hammer)

counts <- Biobase::exprs(hammer)
head(counts)
#>                    SRX020102 SRX020103 SRX020104 SRX020105 SRX020091-3
#> ENSRNOG00000000001         2         4        18        24           7
#> ENSRNOG00000000007         4         1         3         1           5
#> ENSRNOG00000000008         0         1         4         2           0
#> ENSRNOG00000000009         0         0         0         0           0
#> ENSRNOG00000000010        19        10        19        13          50
#> ENSRNOG00000000012         7         5         1         0          31
#>                    SRX020088-90 SRX020094-7 SRX020098-101
#> ENSRNOG00000000001            4          93            77
#> ENSRNOG00000000007            4           9             4
#> ENSRNOG00000000008            5           2             6
#> ENSRNOG00000000009            0           0             0
#> ENSRNOG00000000010           57          45            58
#> ENSRNOG00000000012           26          12             9

The Hammer study

published as part of the biobroom package, part of the ReCount project

# more information about the study:
Biobase::phenoData(hammer)@data
#>                   sample.id num.tech.reps protocol         strain     Time
#> SRX020102         SRX020102             1  control Sprague Dawley 2 months
#> SRX020103         SRX020103             2  control Sprague Dawley 2 months
#> SRX020104         SRX020104             1   L5 SNL Sprague Dawley 2 months
#> SRX020105         SRX020105             2   L5 SNL Sprague Dawley  2months
#> SRX020091-3     SRX020091-3             1  control Sprague Dawley  2 weeks
#> SRX020088-90   SRX020088-90             2  control Sprague Dawley  2 weeks
#> SRX020094-7     SRX020094-7             1   L5 SNL Sprague Dawley  2 weeks
#> SRX020098-101 SRX020098-101             2   L5 SNL Sprague Dawley  2 weeks

biobroom and edgeR

identify differentially expressed genes following the protocol by Robinson, McCarthy and Smyth 2009

library(edgeR)
y <- DGEList(counts = counts, group=Biobase::phenoData(hammer)@data$protocol)
y <- calcNormFactors(y)
y <- estimateCommonDisp(y)
y <- estimateTagwiseDisp(y)
et <- exactTest(y)

glance(et, alpha = 0.05) # glance on DGEExact
#>   significant     comparison
#> 1        6993 control/L5 SNL

biobroom and edgeR

tet <- tidy(et)
tet$significant <- tet$p.value < 0.05
ggplot(data=tet, aes(x=logCPM, y=estimate, colour=significant)) +
  geom_point(alpha=.1) + facet_wrap(~significant, labeller="label_both")

augment(y) stops in an error (this is a bug and reported)

biobroom and limma

bb <- data.frame(read_tsv("../data/biotin-rma2.txt"))
head(data.frame(bb[,-2]))
#>         Gene biotin.WT.01.1 biotin.bio101.4 biotin.WT.B1.2 biotin.bio1B1.3
#> 1   11986_at       6.359180        6.004075       6.325338        6.255888
#> 2   11987_at       7.833549        6.666034       7.738628        7.691321
#> 3   11988_at       6.145599        6.128749       6.258157        6.353912
#> 4   11989_at       5.675039        5.078695       5.444967        5.380756
#> 5   11990_at       5.382078        5.241411       5.209638        5.357740
#> 6 11991_g_at       5.389809        5.087797       5.403933        5.592901
#>   biotin.WT.02.1 biotin.bio102.4 biotin.WT.B2.2 biotin.bio1B2.3
#> 1       6.554727        6.144557       6.098169        6.201624
#> 2       7.461711        7.300383       7.353033        7.298990
#> 3       6.097703        6.183593       6.184723        6.276867
#> 4       5.258524        5.328962       5.460898        5.203674
#> 5       5.337013        5.276889       5.434531        5.362002
#> 6       5.571362        5.249382       5.596683        5.140585
row.names(bb) <- bb$Gene

Looking at the gene expression data

ggpairs(bb, columns=c(3,7,4,8))

A porcupine plot again

sub <- bb %>% select(Gene, biotin.WT.01.1, biotin.WT.02.1, biotin.bio101.4, biotin.bio102.4)
ggplot(sub, aes(x=biotin.WT.01.1, xend=biotin.WT.02.1, y=biotin.bio101.4, yend=biotin.bio102.4)) +
  geom_segment() +
  theme(aspect.ratio = 1) +
  xlab("wildtype, control treatment") +
  ylab("mutant, treated")

Fit a limma model

design <- expand.grid(type=c("wild", "mutant"), trt=c("control", "treatment"), rep=1:2)

fit <- lmFit(bb[,-(1:2)], model.matrix(~ type*trt, design))
fit <- eBayes(fit)

head(topTable(fit))
#>            typemutant trttreatment typemutant.trttreatment  AveExpr
#> 13212_s_at   3.436575    0.1501056               -3.282545 6.383540
#> 13449_at     2.498316    0.1163889               -2.488131 5.287509
#> 14609_at     2.301786   -0.2724221               -1.962619 5.406537
#> 16016_at    -3.749019    1.2864088                3.338380 7.858121
#> 18255_at     1.696560   -0.3977035               -1.326075 6.844090
#> 15162_at     2.699042    0.1022273               -2.354713 8.408859
#>                    F      P.Value    adj.P.Val
#> 13212_s_at 297.14511 8.048043e-08 0.0006677461
#> 13449_at   154.05666 8.051126e-07 0.0033400095
#> 14609_at   129.24605 1.484583e-06 0.0041058604
#> 16016_at   108.19217 2.752579e-06 0.0048917219
#> 18255_at   106.07250 2.947886e-06 0.0048917219
#> 15162_at    91.58383 4.898100e-06 0.0067732558

Your Turn

For the previous example, try out what output the different broom functions (glance, tidy, augment) produce. Create a Volcano plot for each of the model terms, i.e. plot estimates on x by log(p.values) on y. Are there differences visible between the terms?

bbfit <- tidy(fit)
ggplot(data=bbfit, aes(x=term, y=estimate, group=gene)) +
  geom_line(alpha=0.1) +
  geom_point(aes(color=log(p.value)), size=2, alpha=0.6)

Is type*treatment interaction necessary? Very strong negative correlation is suspicious.

bbfit_m <- bbfit %>% select(gene, term, estimate, p.value) %>% 
  gather(fit.stat, value, -gene, -term) %>%
  unite(term_stat, term, fit.stat) %>%
  spread(term_stat, value) %>% 
  rename(trt=trttreatment_estimate, mut=typemutant_estimate,
         int=`typemutant:trttreatment_estimate`, 
         trtp=trttreatment_p.value, mutp=typemutant_p.value, 
         intp=`typemutant:trttreatment_p.value`) 
ggpairs(bbfit_m, columns=c(2,4,6), upper=list(continuous="points"),
          ggplot2::aes(colour=intp)) + theme(aspect.ratio=1)

fit2 <- lmFit(bb[,-(1:2)], model.matrix(~ type+trt, design))
fit2 <- eBayes(fit2)
bbfit2 <- tidy(fit2)
ggplot(data=bbfit2, aes(x=term, y=estimate, group=gene)) +
  geom_line(alpha=0.1) +
  geom_point(aes(color=log(p.value)), size=2, alpha=0.6)

References

Share and share alike

This work is licensed under the Creative Commons Attribution-Noncommercial 3.0 United States License. To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/ 3.0/us/ or send a letter to Creative Commons, 171 Second Street, Suite 300, San Francisco, California, 94105, USA.